Great Learning : AIML Online Capstone -AUTOMATIC TICKET ASSIGNMENT
DecA : Group 4: NLP 1
Group Members :
from google.colab import drive
drive.mount('/content/drive')
!pip install ftfy
from time import time
from PIL import Image
from zipfile import ZipFile
import os, sys, itertools, re
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot, plot
from sklearn.preprocessing import QuantileTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score, confusion_matrix, classification_report
import sklearn.neighbors._base
import imblearn
from imblearn.over_sampling import SMOTE
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization, Activation, Conv2D, MaxPooling2D, Reshape, Embedding, LSTM, TimeDistributed, Bidirectional, Lambda, Input, Add, GlobalMaxPool1D
from tensorflow.keras import regularizers, optimizers
from sklearn.metrics import r2_score
from tensorflow.keras.models import load_model
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
#import cv2
from tensorflow.keras.applications.mobilenet import preprocess_input
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
# to define loss
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.backend import log, epsilon
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.datasets import imdb
from itertools import islice
import re
import nltk
from nltk.corpus import stopwords
from ftfy import fix_encoding, fix_text, badness
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.utils import resample
import pickle, string
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
import spacy
# Suppress warnings
import warnings; warnings.filterwarnings('ignore')
SEED = 123 # to be able to rerun the same NN
np.random.seed(SEED)
tf.random.set_seed(SEED)
from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_rows = None
! pip install langdetect
from langdetect import detect
from langdetect import detect
!pip install goslate
from goslate import Goslate
!pip install spacy
from collections import defaultdict
from bs4 import BeautifulSoup
import gensim
import gensim.corpora as corpora
#Remove stemming(snowball stemming) add lemmatistaion using simple_process from gensim
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# ---- Load the raw ticket export, drop duplicates, and explore the target ----
dataset = pd.read_excel('/content/sample_data/input_data.xlsx')
dataset.shape
dataset.isnull().sum()
dataset[pd.isnull(dataset).any(axis=1)]
dataset.fillna(str(), inplace=True)  # replace NaNs with empty strings so text ops never hit NaN
dataset.isnull().sum()
duplicate = dataset[dataset.duplicated()]
duplicate.info()
dataset1 = dataset[~dataset.duplicated()]  # keep only the first copy of exact duplicate rows
dataset1.info()
dataset1.head(20)
# Class distribution of the target ('Assignment group'): raw counts, then proportions
plt.figure(figsize=(20,15))
dataset1['Assignment group'].value_counts().plot(kind='bar')
plt.figure(figsize=(20,15))
dataset1['Assignment group'].value_counts(normalize=True).plot(kind='bar')
display(dataset1['Assignment group'].value_counts(normalize=True))
# Top 5 callers within each assignment group
top_n = 5
df_sample2 = dataset1['Caller'].groupby(dataset1['Assignment group']).value_counts()
caller_grp = pd.DataFrame(df_sample2.groupby(level=0).nlargest(top_n).reset_index(level=0, drop=True))
caller_grp.head(15)
# Pie chart of top-5 callers for each of the 10 largest groups
top_n = 10
df_sample3 = pd.DataFrame(dataset1.groupby('Assignment group').size(),columns = ['Count']).reset_index()
top_grps = df_sample3.nlargest(top_n, 'Count')['Assignment group'].tolist()
fig_cols = 5
fig_rows = int(np.ceil(top_n/fig_cols))
fig, axes = plt.subplots(fig_rows, fig_cols, figsize=(13,9.5))
fig.suptitle('Top 5 callers in each of top 10 assignment groups- Pie Chart (Fig-8)', y=1, va= 'bottom', size='20')
for row in range(fig_rows):
    for col in range(fig_cols):
        grp_n = fig_cols * row + col
        if grp_n < top_n:  # the grid may have more cells than groups
            xs = caller_grp.xs(top_grps[grp_n])
            _ = axes[row,col].pie(xs, autopct='%1.1f%%', explode=[0.05]*5)
            axes[row,col].legend(labels=xs.index,loc="best")
            axes[row,col].axis('equal')
            axes[row,col].set_title(top_grps[grp_n])
plt.tight_layout()
def is_mojibake_impacted(text):
    """Heuristic mojibake check built on ftfy's `badness` module.

    NOTE(review): despite the name, this returns True when the text looks
    FINE (no weird byte sequences, or not encodable as CP-1252) and False
    when mojibake is likely. The call site below negates the result via
    `~...all(1)`, so the overall row selection is consistent — but the
    name reads inverted; confirm before reusing elsewhere.
    """
    if not badness.sequence_weirdness(text):
        # nothing weird, should be okay
        return True
    try:
        # sloppy-windows-1252 codec is registered by ftfy
        text.encode('sloppy-windows-1252')
    except UnicodeEncodeError:
        # Not CP-1252 encodable, probably fine
        return True
    else:
        # Encodable as CP-1252, Mojibake alert level high
        return False
# Rows where at least one cell looks mojibake-affected (note the negation:
# is_mojibake_impacted returns True for clean-looking text).
dataset1[~dataset1.iloc[:,:].applymap(is_mojibake_impacted).all(1)]
# Repair encoding damage in both text columns with ftfy
dataset1['Short description']=dataset1['Short description'].apply(fix_text)
dataset1['Description']=dataset1['Description'].apply(fix_text)
dataset1.loc[7581]  # spot-check a previously garbled row
def fn_lan_detect(df):
    """Return the language code langdetect infers for a single text value.

    Parameters
    ----------
    df : str
        One cell of a text column (despite the name, this is a single
        string, not a DataFrame — it is used via Series.apply below).

    Returns
    -------
    str
        The detected ISO-639-1 language code, or the sentinel 'no' when
        detection fails (langdetect raises LangDetectException on empty
        or undetectable text).
    """
    try:
        return detect(df)
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit.
        return 'no'
# Detect the language of every description and plot the distribution,
# annotating each bar with its record count.
dataset1['Language'] = dataset1['Description'].apply(fn_lan_detect)
x = dataset1["Language"].value_counts()
x=x.sort_values(ascending=False)
plt.figure(figsize=(10,6))
ax= sns.barplot(x.index, x.values, alpha=0.8)
plt.title("Distribution of text by language")
plt.ylabel('number of records')
plt.xlabel('Language')
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.show();
# Same plot restricted to the non-English records
dataset1_nonEnglish = pd.DataFrame(dataset1[dataset1["Language"]!='en'])
x = dataset1_nonEnglish["Language"].value_counts()
x=x.sort_values(ascending=False)
plt.figure(figsize=(10,6))
ax= sns.barplot(x.index, x.values, alpha=0.8)
plt.title("Distribution of text by Non English language")
plt.ylabel('number of records')
plt.xlabel('Language')
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.show();
dataset1['Language'].value_counts()
# Regex for bare email addresses, used by fn_remove_irrelaventWords below.
email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
def fn_remove_irrelaventWords(df,columnName):
    """Strip emails, mail-header tokens, numbers and punctuation from a text column.

    Lower-cases every value of ``df[columnName]`` and applies an ordered
    sequence of regex substitutions (emails, "from:"/"cc:"-style header
    tokens, digits, newlines, HTML entities, hyperlinks, and a few
    punctuation characters). Mutates ``df`` and returns it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean; modified in place.
    columnName : str
        Name of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        The same frame, for chaining.
    """
    # (pattern, replacement) pairs, applied in order — order matters:
    # e.g. the email patterns must run before the ':'-suffixed header tokens.
    substitutions = [
        (email_regex, ""),            # bare email addresses
        (r'\S*@\S*\s?', ''),          # email addresses with appended text
        (r"received from:", ' '),     # mail-header tokens
        (r"from:", ' '),
        (r"to:", ' '),
        (r"subject:", ' '),
        (r"sent:", ' '),
        (r"ic:", ' '),
        (r"cc:", ' '),
        (r"bcc:", ' '),
        (r'\d+', ''),                 # numbers
        (r'\n', ' '),                 # newline characters
        (r'#', ''),                   # hashtag sign (keeps the hashtag text)
        (r'&;?', 'and'),              # ampersand
        (r'\&\w*;', ''),              # HTML special entities (e.g. &amp;)
        (r'https?:\/\/.*\/\w*', ''),  # hyperlinks
        (r"hello", ' '),              # greeting word carries no signal
        (r"_", ' '),
        (r"-", ' '),
        (r",", ' '),
        (r"\(", ' '),
        (r"\)", ' '),
    ]

    def _clean(text):
        text = text.lower()
        for pattern, repl in substitutions:
            text = re.sub(pattern, repl, text)
        return text

    # A single vectorised apply replaces the original per-row chained-indexing
    # writes (a SettingWithCopy hazard) and the repeated .loc lookups per row.
    df[columnName] = df[columnName].apply(_clean)
    return df
# Reset to a fresh 0..n-1 index (the positional loops in the cleaners
# require it), clean both text columns, then drop the saved old index.
df_clean = dataset1.reset_index()
df_clean.info()
df_clean = fn_remove_irrelaventWords(df_clean,'Description')
df_clean = fn_remove_irrelaventWords(df_clean,'Short description')
df_clean.tail(20)
df_clean = df_clean.drop(columns=['index'],axis=1)
df_clean.info()
def fn_removeCaller(df,columnName):
    """Anonymise caller names inside a text column.

    For every row, replaces occurrences of that row's 'Caller' value in
    ``df[columnName]`` with the literal "person" — both the name as-is
    and with its spaces removed (names sometimes appear squashed, e.g.
    in email addresses). Mutates ``df`` and returns it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'Caller' column and a 0..n-1 positional index.
    columnName : str
        Name of the text column to anonymise.

    Returns
    -------
    pandas.DataFrame
        The same frame, for chaining.
    """
    for index in range(df.shape[0]):
        caller = df['Caller'][index]
        text = df[columnName][index]
        # Full name, e.g. "john doe"
        if caller in text:
            text = text.replace(caller, "person")
        # Name with spaces stripped, e.g. "johndoe" (checked against the
        # already-substituted text, matching the original behaviour)
        squashed = caller.replace(" ", "")
        if squashed in text:
            text = text.replace(squashed, "person")
        # One .loc write per row avoids the chained-indexing
        # (SettingWithCopy) writes of the original.
        df.loc[index, columnName] = text
    return df
# Anonymise caller names in both text columns, then checkpoint to disk.
df_clean = fn_removeCaller(df_clean,'Description')
df_clean = fn_removeCaller(df_clean,'Short description')
df_clean.tail(10)
df_clean.to_excel("df_clean_LangDetected.xlsx")
# One-off Goslate translation of the non-English rows, kept for provenance;
# the translated output is re-loaded from disk below instead of re-running it.
# svc_domains = ['.com','.com.au','.com.ar','.co.kr','.co.in','.co.jp','.at','.de','.ru','.ch','.fr','.es','.ae']
# svc_urls = ['http://translate.google' + domain for domain in svc_domains]
# gs = Goslate(service_urls=svc_urls)
# trans_8416 = gs.translate(df_clean['Description'][8416], target_language='en', source_language='auto')
# print ('Original Text : ',df_clean['Description'][8416])
# print('Traslated to English : ',trans_8416)
# def fn_ConvertToEnglish(df,columnName):
#     for idx in range(df.shape[0]):
#         row_iter = gs.translate(df[columnName][idx],target_language='en',source_language = 'auto')
#         df[columnName][idx] = str(row_iter)
#     return df
# df_lang = fn_ConvertToEnglish(df_clean,'Description')
# df_lang = fn_ConvertToEnglish(df_clean,'Short description')
#df_lang.to_excel("df_clean_LangDetected_Translated.xlsx")
df_lang = pd.read_excel("/content/sample_data/df_clean_LangDetected_Translated.xlsx")
df_lang.info()
df_lang = df_lang.drop(columns=['Unnamed: 0'],axis=1)  # drop the exported index column
df_lang.info()
df_lang[pd.isnull(df_lang).any(axis=1)]
df_lang.fillna(str(), inplace=True)  # empty strings instead of NaN, as before
df_lang.isnull().sum()
# Combine short and long descriptions into a single text feature
df_lang.insert(loc=4,
               column='combined_description',
               allow_duplicates=True,
               value=list(df_lang['Short description'].str.strip() + ' ' + df_lang['Description'].str.strip()))
df_lang.info()
df_lang.head()
def fn_remove_irrelaventWords_LevelTwo(df,columnName):
    """Second-pass cleanup: strip leftover punctuation/markup from a text column.

    Lower-cases every value of ``df[columnName]`` and applies an ordered
    list of literal string replacements (no regex). Order matters: the
    multi-character tokens (e.g. "// ::", "* * * ") must run before their
    single-character components. Mutates ``df`` and returns it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean; modified in place.
    columnName : str
        Name of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        The same frame, for chaining.
    """
    # (old, new) literal replacements, applied in this exact order.
    replacements = [
        ("// ::", ' '),
        ("<", ' '),
        (">", ' '),
        (";", ' '),
        (".", ' '),
        ("•", ' '),
        ("?", ' '),
        ("\\", ' '),
        ("\\/", ' '),      # backslash+slash; dead after the "\\" pass, kept for parity
        (":", ' '),
        ("%", ' '),
        ("=", ' '),
        ("[mail ]", ' '),
        ("[", ' '),
        ("]", ' '),
        ("< mail >", ' '), # dead after the "<"/">" passes, kept for parity
        ("+", ' '),
        ("\"", ' '),
        ("' ", ' '),
        (" '", ' '),
        ("* * * ", ' '),
        (" * * *", ' '),
        ("* ", ' '),
        (" *", ' '),
        ("/ ", ' '),
        ("撤回 ", ' '),    # Chinese mail tokens: "recall"
        ("答复 ", ' '),    # "reply"
        ("*", ' '),
        ("/", ' '),
    ]

    def _clean(text):
        text = text.lower()
        for old, new in replacements:
            text = text.replace(old, new)
        return text

    # Vectorised apply replaces the original per-row chained-indexing writes
    # (a SettingWithCopy hazard).
    df[columnName] = df[columnName].apply(_clean)
    return df
# Apply the second-pass cleanup to the combined text and checkpoint to disk.
df_lang_clean = fn_remove_irrelaventWords_LevelTwo(df_lang,"combined_description")
df_lang_clean.head(20)
df_lang_clean.to_excel("df_Moji_langDet_Translated_clean_combinedDesc.xlsx")
def deterministicRules(df,columnName):
    """Assign 'pred_group' for tickets matched by hand-crafted keyword rules.

    Mutates df in place: rows whose text (or caller) matches one of the
    ordered rules below get a hard-coded assignment group written into
    df['pred_group']; all other rows keep their existing value (NaN).
    Rule order matters — the first matching branch wins.

    NOTE(review): writes use chained indexing (df['pred_group'][i]), which
    pandas may flag as SettingWithCopy; it works here because the caller
    passes the original frame, not a slice.
    """
    for i in range(df.shape[0]):
        # Only rows with non-missing text are considered.
        if pd.notna(df[columnName][i]):
            #1 Contains telephony_software > GRP_7
            if ('telephony software' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_7'
            #2 contains cutview > GRP_66
            elif ('cutview' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_66'
            #3 contains engg application > GRP_58
            elif ('engg application' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_58'
            #4 contains ethics > GRP_23
            elif ('ethics' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_23'
            # contains crm dynamics > GRP_22
            elif ('crm dynamics' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_22'
            # contains distributor tool & company center > GRP_21
            elif ('distributor tool' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_21'
            elif ('company center' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_21'
            # contains bpctwhsn kzqsbmtp & network outage or circuit outage > GRP_8
            # NOTE(review): the and/or precedence here makes the caller check
            # apply only to 'network outage'; any 'circuit outage' row matches
            # regardless of caller — confirm that is intended.
            elif ((df['Caller'][i] == 'bpctwhsn kzqsbmtp' and ('network outage' in df[columnName][i]) or 'circuit outage' in df[columnName][i])):
                df['pred_group'][i] = 'GRP_8'
            elif ('reset passwords' in df[columnName][i] and 'the' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_17'
            elif (df[columnName][i].startswith('erp access issue')):
                df['pred_group'][i] = 'GRP_2'
            elif ('vsphere' in df[columnName][i] or 'esxi' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_12'
            elif ('windows account' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_0'
            elif ('erp sid account lock' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_0'
            elif ('erp sid password reset' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_0'
            # Caller-specific routing for the eutool keyword
            elif(df['Caller'][i] == 'jionmpsf wnkpzcmv' and 'eutool' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_24'
            elif(df['Caller'][i] == 'cwrikael oanmsecr' and 'eutool' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_0'
            elif ('sso portal' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_73'
            elif ('unable complete forecast' in df[columnName][i]):
                df['pred_group'][i] = 'GRP_67'
            elif (df[columnName][i].startswith('timecard') or df[columnName][i].startswith('time card')):
                df['pred_group'][i] = 'GRP_36'
# Apply the deterministic rules; rows they classify are split off so the
# model only has to learn the remainder.
df_lang_clean.insert(loc=5,column='pred_group',value=np.nan,allow_duplicates=True)
df_lang_clean.info()
deterministicRules(df_lang_clean,"combined_description")
df_determinted = df_lang_clean[~df_lang_clean['pred_group'].isna()]  # rule-classified rows
df_determinted.info()
df_determinted.head()
df_NonDet = df_lang_clean[df_lang_clean['pred_group'].isna()]  # rows left for the model
df_NonDet = df_NonDet.reset_index()
df_NonDet = df_NonDet.drop(columns=['index'],axis=1)
df_NonDet.info()
df_NonDet.insert(loc=4,column='New Assignment Group',value=np.nan,allow_duplicates=True)
# Groups with <= 10 tickets are merged into a catch-all 'GRP_99'.
groupsToBeMerged = pd.DataFrame(df_NonDet['Assignment group'].value_counts() <=10)
groupsToBeMerged = groupsToBeMerged[groupsToBeMerged['Assignment group'] == True]
groupsToBeMergedList = list(groupsToBeMerged.index)
groupsToBeMergedList
len(groupsToBeMergedList)
df_NonDet.info()
for index in range(df_NonDet.shape[0]):
    if (df_NonDet['Assignment group'][index] in groupsToBeMergedList):
        df_NonDet['New Assignment Group'][index] = 'GRP_99'
    else:
        df_NonDet['New Assignment Group'][index] = df_NonDet['Assignment group'][index]
df_NonDet.tail(30)
df_NonDet['New Assignment Group'].unique()
df_NonDet['New Assignment Group'].value_counts()
# Separate copies for the classic-ML and deep-learning pipelines
df_ML = df_NonDet.copy() # Creating copy
df_DL = df_NonDet.copy()
df_ML.shape, df_DL.shape
df_DL.to_excel("df_DL.xlsx")
nltk.download('stopwords')
# NOTE(review): this rebinds the imported `stopwords` corpus name to a plain
# set — it works because only the set is used afterwards, but it shadows
# nltk.corpus.stopwords for the rest of the file.
stopwords = set(stopwords.words('english'))
# Remove stopwords
df_ML['combined_description'] = df_ML['combined_description'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
df_ML.head()
# Initialize spacy 'en' model, keeping only tagger component needed for
# lemmatization. NOTE(review): the 'en' shortcut works on spaCy 2.x;
# newer spaCy requires the full model name (e.g. 'en_core_web_sm').
nlp = spacy.load('en', disable=['parser', 'ner'])
# Define a function to lemmatize the descriptions
def lemmatizer(sentence):
    """Lemmatize *sentence* with the module-level spaCy pipeline ``nlp``,
    dropping the '-PRON-' placeholder lemmas spaCy emits for pronouns."""
    lemmas = []
    for token in nlp(sentence):
        if token.lemma_ != '-PRON-':
            lemmas.append(token.lemma_)
    return " ".join(lemmas)
# Lemmatize the combined text and checkpoint, then draw an overall word cloud.
df_ML['combined_description'] = df_ML['combined_description'].apply(lemmatizer)
df_ML.head(10)
df_ML.to_excel("df_ML_StopWords_Lemmatized.xlsx")
# Configure the word cloud renderer
wc = WordCloud(width = 3000,
               height = 2000,max_words=300,
               background_color = 'black')
# generate word cloud for data
wc_word=wc.generate(str(df_ML.combined_description))
print(wc_word)
# declare our figure
plt.figure(figsize=(20,10), facecolor='k')
# add title to the graph
plt.title("Most frequent words in dataset", fontsize=20,color='white')
plt.imshow(wc_word)
plt.show()
#ngram function
def ngram_extractor(text, n_gram):
    """Return the n-grams of *text* as space-joined strings.

    The text is lower-cased and split on single spaces; empty tokens and
    wordcloud STOPWORDS are discarded before the n-grams are formed.
    """
    words = []
    for tok in text.lower().split(" "):
        if tok != "" and tok not in STOPWORDS:
            words.append(tok)
    shifted = (words[offset:] for offset in range(n_gram))
    return [" ".join(gram) for gram in zip(*shifted)]
# Function to generate a dataframe with n_gram and top max_row frequencies
def generate_ngrams(df, n_gram, max_row):
    """Count every n-gram across the texts in *df* and return the top
    *max_row* rows as a DataFrame with columns ['word', 'wordcount'],
    most frequent first."""
    counts = defaultdict(int)
    for text in df:
        for gram in ngram_extractor(text, n_gram):
            counts[gram] += 1
    # Sort ascending by count then reverse, preserving the original
    # tie-breaking behaviour.
    ranked = sorted(counts.items(), key=lambda kv: kv[1])[::-1]
    top = pd.DataFrame(ranked).head(max_row)
    top.columns = ["word", "wordcount"]
    return top
df_ML.info()
Ticket_desc = df_ML['combined_description']
#Define empty list
ticket_desc_cleaned = []
res = []
# Extract any <p>-tag text from each description.
# NOTE(review): ticket_desc_cleaned is never used after this loop — the
# n-gram pipeline below restarts from the raw Ticket_desc; confirm intent.
for l in Ticket_desc:
    #Parse the contents of the cell
    soup = BeautifulSoup(l, 'html.parser')
    #Find all instances of the text within the </p> tag
    for el in soup.find_all('p'):
        res.append(el.get_text())
    #concatenate the strings from the list
    endstring = ' '.join(map(str, res))
    #reset list
    res = []
    #Append the concatenated string to the main list
    ticket_desc_cleaned.append(endstring)
# Normalise the raw descriptions for n-gram counting
ticket_desc_na_cleaned = [item.lower() for item in Ticket_desc]
#remove html links from list
ticket_desc_na_cleaned = [re.sub(r"http\S+", "", item) for item in ticket_desc_na_cleaned]
#remove special characters left
ticket_desc_na_cleaned = [re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", item) for item in ticket_desc_na_cleaned]
#convert to dataframe and rename the column of the ticket_desc_na_cleaned list
ticket_desc_clean = pd.DataFrame(np.array(ticket_desc_na_cleaned).reshape(-1))
ticket_desc_clean.columns = ["ans"]
#Squeeze dataframe to obtain series
desc_cleaned = ticket_desc_clean.squeeze()
#generate unigram
ans_unigram = generate_ngrams(desc_cleaned, 1, 30)
ans_unigram.head()
ans_unigram.info()
#generate barplot for unigram
plt.figure(figsize=(12,8))
sns.barplot(ans_unigram["wordcount"],ans_unigram["word"])
plt.xlabel("Word Count", fontsize=15)
plt.ylabel("Unigrams", fontsize=15)
plt.title("Top 30 Unigrams for Combined Column Translated to English")
plt.show()
# Word cloud of the top unigrams
wc = WordCloud(width = 3000,
               height = 2000,max_words=300,
               background_color = 'black')
# generate word cloud for data
#wc_word=wc.generate(str(ans_unigram))
wc_word=wc.generate(str(ans_unigram['word']))
print(wc_word)
# declare our figure
plt.figure(figsize=(20,10), facecolor='k')
# add title to the graph
plt.title("Most frequent words in dataset", fontsize=20,color='white')
plt.imshow(wc_word)
plt.show()
#generate bigram
ans_bigram = generate_ngrams(desc_cleaned, 2, 20)
#generate barplot for bigram
plt.figure(figsize=(12,8))
sns.barplot(ans_bigram["wordcount"],ans_bigram["word"])
plt.xlabel("Word Count", fontsize=15)
plt.ylabel("Bigrams", fontsize=15)
plt.title("Top 20 Bigrams for Combined Column Translated to English")
plt.show()
# Word cloud of the top bigrams
wc = WordCloud(width = 3000,
               height = 2000,max_words=300,
               background_color = 'black')
# generate word cloud for data
wc_word=wc.generate(str(ans_bigram['word']))
print(wc_word)
# declare our figure
plt.figure(figsize=(20,10), facecolor='k')
# add title to the graph
plt.title("Most frequent words in dataset", fontsize=20,color='white')
plt.imshow(wc_word)
plt.show()
#generate trigram
ans_trigram = generate_ngrams(desc_cleaned, 3, 20)
#generate barplot for trigram
plt.figure(figsize=(12,8))
sns.barplot(ans_trigram["wordcount"],ans_trigram["word"])
plt.xlabel("Word Count", fontsize=15)
plt.ylabel("Trigrams", fontsize=15)
plt.title(" Top 20 Trigrams for ticket description")
plt.show()
# Word cloud of the top trigrams
wc = WordCloud(width = 3000,
               height = 2000,max_words=300,
               background_color = 'black')
# generate word cloud for data
wc_word=wc.generate(str(ans_trigram['word']))
print(wc_word)
# declare our figure
plt.figure(figsize=(20,10), facecolor='k')
# add title to the graph
plt.title("Most frequent words in dataset", fontsize=20,color='white')
plt.imshow(wc_word)
plt.show()
#https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess
def sent_to_words(sentences):
    """Yield each sentence tokenised by gensim's simple_preprocess
    (deacc=True also strips punctuation/accents)."""
    for raw in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw), deacc=True)
        yield tokens
# Tokenise every description, then learn bigram/trigram phrase models.
data_words = list(sent_to_words(df_ML['combined_description']))
print(len(data_words))
# Build the bigram and trigram models
#https://radimrehurek.com/gensim/models/phrases.html
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Spot-check on the second document
print(bigram_mod[data_words[1]])
print(trigram_mod[data_words[1]])
def make_trigrams(texts):
    """Apply the fitted bigram then trigram Phraser models to every document."""
    merged = []
    for doc in texts:
        merged.append(trigram_mod[bigram_mod[doc]])
    return merged
# Form Bigrams/Trigrams and render an overall word cloud of the phrases.
data_words_trigrams = make_trigrams(data_words)
wordclouds=' '.join(map(str, data_words_trigrams))
# Strip the list syntax (brackets/quotes) left by str() on token lists.
# NOTE(review): .replace(' ',' ') is a no-op — possibly meant to collapse
# double spaces; kept as-is.
wordCloudfinal = wordclouds.replace('\', \'',' ').replace(']',' ').replace('[',' ').replace('\'','').replace(' ',' ').replace('*','')
wordcloud = WordCloud(width=480, height=480, max_font_size=20, min_font_size=10).generate(wordCloudfinal)
plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()
# Attach the phrase tokens to a copy of the frame for per-group clouds
new_df = df_ML.copy()
new_df['words'] = data_words_trigrams
new_df.head()
# Groups ordered by descending ticket count
sortedListOfGroup = df_ML['New Assignment Group'].value_counts().sort_values(ascending=False).index
sortedListOfGroup
def wordcloud_grp(f, x):
    """Render a word cloud of the 50 most common words for group *x*
    from the token lists in *f* (a Series of token lists)."""
    joined = ' '.join(str(tokens) for tokens in f)
    cloud = WordCloud(width=480, height=480, max_font_size=20,
                      min_font_size=10, max_words=50).generate(joined.replace('\'', ''))
    plt.figure(figsize=(20, 10))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Most common 50 words of {}".format(x))
    plt.margins(x=0, y=0)
    plt.show()
# One word cloud for each of the 10 largest assignment groups
for i in range(10):
    Grp = new_df[new_df ['New Assignment Group'] == sortedListOfGroup[i]]
    Grp = Grp['words']
    wordcloud_grp(Grp,sortedListOfGroup[i])
df_ML.info()
df_DL.info()
Making copies of the datasets for different steps
# Independent copies for each experiment branch (augmented / non-augmented,
# classic-ML / deep-learning), each checkpointed to disk.
df_ML_Aug = df_ML.copy()
df_ML_NonAug = df_ML.copy()
df_DL_Aug = df_DL.copy()
df_DL_NonAug = df_DL.copy()
df_ML_Aug.shape, df_ML_NonAug.shape, df_DL_Aug.shape,df_DL_NonAug.shape
df_ML_Aug.to_excel("df_ML_Aug.xlsx")
df_ML_NonAug.to_excel("df_ML_NonAug.xlsx")
df_DL_Aug.to_excel("df_DL_Aug.xlsx")
df_DL_NonAug.to_excel("df_DL_NonAug.xlsx")
11.1 ML Model without upsampling or augmentation
Here we maintain the classes as per their original proportion and try constructing the model to see how the performance looks
df_ML_NonAug.info()
df_ML_NonAug.head()
# Encode the target labels as integer category codes
df_ML_NonAug['target'] = df_ML_NonAug['Assignment group'].astype('category').cat.codes
df_ML_NonAug.groupby(["Assignment group", "target"]).size()
# Create training and test datasets with 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(df_ML_NonAug.combined_description ,
                                                    df_ML_NonAug.target,
                                                    test_size=0.20,
                                                    random_state=42)
print('\033[1mShape of the training set:\033[0m', X_train.shape, y_train.shape)
print('\033[1mShape of the test set:\033[0m', X_test.shape, y_test.shape)
def fit_n_print(model, X_train, X_test, y_train, y_test): # take the model, train data and test data as input
    """Train *model* in a CountVectorizer -> TF-IDF -> classifier pipeline and report metrics.

    Parameters
    ----------
    model : estimator
        An unfitted sklearn-compatible classifier.
    X_train, X_test : iterable of str
        Raw text documents.
    y_train, y_test : array-like
        Integer-encoded target labels.

    Returns
    -------
    tuple
        (accuracy_training, accuracy_test, recallscore_training,
        recallscore_test, precision_training, precision_test,
        f1score_training, f1score_test, duration, y_pred) — scores in
        percent (multi-class metrics support-weighted), duration in
        seconds, y_pred the test-set predictions.
    """
    # Local import: at this point in the file the module-level name `time`
    # is the *function* from `from time import time`; it is only rebound to
    # the module by a later `import time`, so `time.time()` here was fragile.
    from time import time as _now
    start = _now()  # note the start time
    # (The original `clf = model` assignment was dead code — the Pipeline
    # below immediately rebinds clf.)
    clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('model', model),
                    ])
    clf.fit(X_train, y_train)  # fit the model using the train data
    pred_train = clf.predict(X_train)  # model predictions on the training data
    y_pred = clf.predict(X_test)  # model predictions on the test data
    # All scores reported in percent; weighted average across classes.
    accuracy_training = (accuracy_score(y_train, pred_train))*100
    accuracy_test = (accuracy_score(y_test, y_pred))*100
    recallscore_training = (recall_score(y_train, pred_train, average='weighted'))*100
    recallscore_test = (recall_score(y_test, y_pred, average='weighted'))*100
    precision_training = (precision_score(y_train, pred_train, average='weighted'))*100
    precision_test = (precision_score(y_test, y_pred, average='weighted'))*100
    f1score_training = (f1_score(y_train, pred_train, average='weighted'))*100
    f1score_test = (f1_score(y_test, y_pred, average='weighted'))*100
    duration = _now() - start  # total wall-clock seconds for fit + predict + scoring
    print('Algorithm:', type(model).__name__)
    print("\n Classification report:\n", classification_report(y_test, y_pred))
    print("\n Confusion report:\n", confusion_matrix(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))
    print()
    print("\n \n")
    return accuracy_training,accuracy_test,recallscore_training, recallscore_test, precision_training,precision_test,f1score_training, f1score_test, duration, y_pred # return all the metrics along with predictions
import time  # rebinds `time` (shadowed by `from time import time` at the top) to the module
# Candidate classifiers for the no-augmentation baseline
rf = RandomForestClassifier()
xgb = XGBClassifier()
SVC = LinearSVC()  # NOTE(review): shadows sklearn.svm.SVC imported above
KNN = KNeighborsClassifier()
NB = MultinomialNB()
result = {} # Create an empty dictionary to later use to store metrics of each of the models
for model, name in zip([rf,xgb, SVC,KNN,NB],
                       ['Random Forest', 'Xgboost', 'SVC','KNN','Naive Bayes']):
    result[name] = fit_n_print(model,X_train, X_test, y_train, y_test)
result_without_aug = pd.DataFrame(np.array(list(result.values()))[:,:-1], # make a dataframe out of the metrics from result dictionary
                                  columns= ['accuracy_training','accuracy_test',
                                            'recallscore_training', 'recallscore_test',
                                            'precision_training','precision_test',
                                            'f1score_training', 'f1score_test',
                                            'Elapsed'],
                                  index= result.keys()) # use the model names as index
result_without_aug.index.name = 'Model' # name the index of the result1 dataframe as 'Model'
result_without_aug
Without any class imbalance treatment, we see that the test data accuracy is low, and in some models, we see overfitting too. We will try treating class imbalance through augmentation next
11.2 ML Model with class imbalance treatment via augmentation
Split train test
Augment data
df_ML_Aug.info()
df_ML_Aug.shape
df_ML_Aug.head()
#import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
!pip install nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action
# WordNet synonym replacement, changing at most 4 words per sentence
aug = naw.SynonymAug(aug_src='wordnet',aug_max=4)
# Take an example for augmentation
example=aug.augment(X_train[5],n=2)
print('\033[1mOriginal text:\033[0m')
print(X_train[5])
print('_'*100)
print('\033[1mAugmented text:\033[0m')
print(example[0])
print(example[1])
# Generate augmented copies per class — more copies for rarer classes
# (3x / 6x / 12x / 24x depending on the label code's frequency band).
augmented_sentences=[]
augmented_sentences_labels=[]
for i in X_train.index:
    if y_train[i] in (24,9,12,2,19,3,6):
        temps=aug.augment(X_train[i],n=3)
        for sent in temps:
            augmented_sentences.append(sent)
            augmented_sentences_labels.append(y_train[i])
    # NOTE(review): label 24 also appears in the first band above, so its
    # occurrence in this band is unreachable — confirm which band 24
    # actually belongs to.
    elif y_train[i] in (13,10,5,14,24,31,18,28,4,16,47):
        temps=aug.augment(X_train[i],n=6)
        for sent in temps:
            augmented_sentences.append(sent)
            augmented_sentences_labels.append(y_train[i])
    elif y_train[i] in (30,32,25,27,37,15,38,29,40,36,11,20,1,42,41,22):
        temps=aug.augment(X_train[i],n=12)
        for sent in temps:
            augmented_sentences.append(sent)
            augmented_sentences_labels.append(y_train[i])
    elif y_train[i] in (46,7,35,17,26,39,34,45,43,33,21,44):
        temps=aug.augment(X_train[i],n=24)
        for sent in temps:
            augmented_sentences.append(sent)
            augmented_sentences_labels.append(y_train[i])
# Encode the target, split 80:20, then append the augmented samples to the
# training split only (the test split stays free of synthetic data).
df_ML_Aug['target'] = df_ML_Aug['Assignment group'].astype('category').cat.codes
X_train, X_test, y_train, y_test = train_test_split(df_ML_Aug.combined_description,
                                                    df_ML_Aug.target,
                                                    test_size=0.20,
                                                    random_state=42)
print('\033[1mShape of the training set:\033[0m', X_train.shape, y_train.shape)
print('\033[1mShape of the test set:\033[0m', X_test.shape, y_test.shape)
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)
print(X_train.shape)
print(y_train.shape)
y_train_df = pd. DataFrame(y_train,columns=['target'])
y_train_df.info()
#Create Dataset
#y_train_df_old_nogrp0 = dataset1[dataset1['Assignment group'] != 'GRP_0']
# Class counts after augmentation, largest group first
descending_order = y_train_df['target'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#add code to rotate the labels
ax=sns.countplot(x='target', data=y_train_df, color='royalblue',order=descending_order)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.title('Group count after augmentation')
plt.show()
# Re-run the same model comparison on the augmented training data
rf = RandomForestClassifier()
xgb = XGBClassifier()
SVC = LinearSVC()  # NOTE(review): shadows sklearn.svm.SVC imported above
KNN = KNeighborsClassifier()
NB = MultinomialNB()
result = {} # Create an empty dictionary to later use to store metrics of each of the models
for model, name in zip([rf,xgb, SVC,KNN,NB],
                       ['Random Forest', 'Xgboost', 'SVC','KNN','Naive Bayes']):
    result[name] = fit_n_print(model,X_train, X_test, y_train, y_test)
result_with_aug = pd.DataFrame(np.array(list(result.values()))[:,:-1], # make a dataframe out of the metrics from result dictionary
                               columns= ['accuracy_training','accuracy_test',
                                         'recallscore_training', 'recallscore_test',
                                         'precision_training','precision_test',
                                         'f1score_training', 'f1score_test',
                                         'Elapsed'],
                               index= result.keys()) # use the model names as index
result_with_aug.index.name = 'Model' # name the index of the result1 dataframe as 'Model'
result_with_aug
Even after augmentation, the test data accuracy remains low and there is strong evidence of overfitting. We will try treating the class imbalance through upsampling/resampling next.
11.3 ML Model with class imbalance treated via upsampling/resampling
# 11.3: copy the augmented ML frame and re-derive an integer target code
# from the original assignment-group column.
df_ML_US =df_ML_Aug.copy()
df_ML_US.info()
df_ML_US['target'] = df_ML_US['Assignment group'].astype('category').cat.codes
#Create Dataset for 'others' i.e all groups which is not part of GRP_0
df_ML_US_nogrp0 = df_ML_US[df_ML_US['New Assignment Group'] != 'GRP_0']
descending_order = df_ML_US_nogrp0['New Assignment Group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#add code to rotate the labels
ax=sns.countplot(x='New Assignment Group', data=df_ML_US_nogrp0, color='royalblue',order=descending_order)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()
# Upsampling target size: the largest non-GRP_0 class count.
maxcount = df_ML_US_nogrp0['New Assignment Group'].value_counts().max()
maxcount
# GRP_0 rows are set aside and re-joined after the rest are upsampled.
df_ML_US_grp0 = df_ML_US[df_ML_US['New Assignment Group'] == 'GRP_0']
df_ML_US_grp0.info()
# Treat the imbalance in the dataset by resampling to 591
# Upsample every non-GRP_0 class to `maxcount` rows (sampling with
# replacement), then append the untouched GRP_0 rows to form the balanced set.
df_ML_US_nogrp0_upsampled = df_ML_US_nogrp0[0:0]  # empty frame, same columns
for grp in df_ML_US_nogrp0['New Assignment Group'].unique():
    grp_rows = df_ML_US_nogrp0[df_ML_US_nogrp0['New Assignment Group'] == grp]
    resampled = resample(grp_rows, replace=True, n_samples=int(maxcount), random_state=SEED)
    df_ML_US_nogrp0_upsampled = df_ML_US_nogrp0_upsampled.append(resampled)
dataset_ML_upsampled = pd.concat([df_ML_US_nogrp0_upsampled, df_ML_US_grp0], ignore_index=True)
# Visualise the balance after upsampling, derive numeric target codes,
# persist the balanced dataset, and make a fresh 80:20 split.
descending_order = dataset_ML_upsampled['New Assignment Group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#add code to rotate the labels
ax=sns.countplot(x='New Assignment Group', data=dataset_ML_upsampled, color='royalblue')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()
# Create a column for target (assignment group) variable
# Strip the 'GRP_' prefix, then map to categorical codes.
dataset_ML_upsampled['target'] = dataset_ML_upsampled['New Assignment Group'].str[4:]
dataset_ML_upsampled['target'] = dataset_ML_upsampled['target'].astype('category').cat.codes
dataset_ML_upsampled.info()
dataset_ML_upsampled.groupby(["New Assignment Group", "target"]).size()
dataset_ML_upsampled.info()
dataset_ML_upsampled.to_excel("dataset_ML_upsampled.xlsx")
# Create training and test datasets with 80:20 ratio without augmentation
X_train, X_test, y_train, y_test = train_test_split(dataset_ML_upsampled.combined_description,
dataset_ML_upsampled.target,
test_size=0.20,
random_state=42)
print('\033[1mShape of the training set:\033[0m', X_train.shape, y_train.shape)
print('\033[1mShape of the test set:\033[0m', X_test.shape, y_test.shape)
X_train.dtypes, X_test.dtypes, y_train.dtypes, y_test.dtypes
# Re-fit the same five baseline classifiers on the upsampled split.
rf = RandomForestClassifier()
xgb = XGBClassifier()
SVC = LinearSVC()
KNN = KNeighborsClassifier()
NB = MultinomialNB()
result = {}  # model display-name -> metric tuple from fit_n_print
for name, model in {'Random Forest': rf, 'Xgboost': xgb, 'SVC': SVC,
                    'KNN': KNN, 'Naive Bayes': NB}.items():
    result[name] = fit_n_print(model, X_train, X_test, y_train, y_test)
# Summary table for the upsampled runs; same layout as result_with_aug.
result_ML_US= pd.DataFrame(np.array(list(result.values()))[:,:-1], # make a dataframe out of the metrics from result dictionary
columns= ['accuracy_training','accuracy_test',
'recallscore_training', 'recallscore_test',
'precision_training','precision_test',
'f1score_training', 'f1score_test',
'Elapsed'],
index= result.keys()) # use the model names as index
result_ML_US.index.name = 'Model' # name the index of the result1 dataframe as 'Model'
result_ML_US
result_ML_US.to_excel('result_ML_US.xlsx')
The ML models with upsampling have overcome the overfitting problem. We will next run a randomised grid search on this data for hyper-parameter tuning.
# Re-create the 80:20 split on the upsampled data before hyper-parameter tuning.
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV,cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
# Create training and test datasets with 80:20 ratio without augmentation
X_train, X_test, y_train, y_test = train_test_split(dataset_ML_upsampled.combined_description,
dataset_ML_upsampled.target,
test_size=0.20,
random_state=42)
print('\033[1mShape of the training set:\033[0m', X_train.shape, y_train.shape)
print('\033[1mShape of the test set:\033[0m', X_test.shape, y_test.shape)
Since Random Forest gave the highest accuracy, precision and recall among all the ML models, we will hyper-tune the Random Forest model.
11.4 HYPER TUNING RANDOM FOREST
# Text-classification pipeline: token counts -> TF-IDF weighting -> Random Forest.
pipeline = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', RandomForestClassifier()),
])
# Grid-search space for the CountVectorizer -> TF-IDF -> RandomForest pipeline.
# Fixes versus the original grid:
#  * 'clf__min_samples_leaf' no longer contains None — RandomForestClassifier
#    requires an int or float there and raises ValueError on None, aborting
#    every CV candidate that draws it.
#  * 'clf__max_features' keeps only 'sqrt' — for classifiers 'auto' is an
#    alias of 'sqrt' (and deprecated in newer scikit-learn), so listing both
#    doubled the grid without adding a new candidate.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__bootstrap': [True],
              'clf__max_depth': [None, 10, 20, 30, 40, 50],
              'clf__max_features': ['sqrt'],
              'clf__min_samples_leaf': [1, 2, 4, 8, 10],
              'clf__n_estimators': [100]}
# Exhaustively grid-search the Random Forest pipeline with 5-fold CV.
# (The __main__ guard lets joblib's n_jobs=-1 workers import this module safely.)
if __name__ == "__main__":
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    print(parameters)
    RF_CV_Fit = grid_search.fit(X_train, y_train)
    print()
    print("Best score: %0.3f" % RF_CV_Fit.best_score_)
    print("Best parameters set:")
    best_parameters = RF_CV_Fit.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
print("Best score of Random Forest Hyper Tuning using GridSearchCV: %0.3f" % RF_CV_Fit.best_score_)
RF_CV_Fit.get_params()
# Load the augmented DL dataset and keep only rows that have not already
# been assigned a predicted group (pred_group is NaN).
#dataset2_DL = df_DL[df_DL['pred_group'].isna()]
dataset2_DL = pd.read_excel('/content/sample_data/df_DL_Aug.xlsx')
dataset2_DL.head()
dataset2_DL.info()
dataset2_DL = dataset2_DL[dataset2_DL['pred_group'].isna()]
dataset2_DL.info()
dataset2_DL.shape
# Create a column for target (assignment group) variable
# Strip the 'GRP_' prefix to get a numeric group id.
dataset2_DL['group'] = dataset2_DL['New Assignment Group'].str[4:]
#from sklearn. preprocessing import LabelEncoder
#le = LabelEncoder()
#dataset2['group'] = le. fit_transform(dataset2['group'])
dataset2_DL['group'] = dataset2_DL['group'].astype('int8')
dataset2_DL['target'] = dataset2_DL['group'].astype('category').cat.codes
dataset2_DL.info()
dataset2_DL.groupby(["group", "target"]).size()
# Drop the index column leaked by the Excel round-trip.
dataset2_DL=dataset2_DL.drop(columns=['Unnamed: 0'],axis=1)
dataset2_DL.info()
12.1 Trying DL without augmentation or replacement
# 12.1: tokenize the un-augmented dataset and pad every sequence to maxlen.
dataset2_DL_noaug=dataset2_DL.copy()
max_features = 10000
maxlen = 300
embedding_size = 200
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(dataset2_DL_noaug['combined_description']))
X = tokenizer.texts_to_sequences(dataset2_DL_noaug['combined_description'])
X = pad_sequences(X, maxlen = maxlen)
Y = np.asarray(dataset2_DL_noaug['target'])
# Print a sample sequence and its label.
print(f'\nsample headline:\n{X[0]}\n\n Label of sample headline: {Y[0]}')
tokenizer.word_index
# +1 because Keras word indices start at 1; row 0 is reserved for padding.
num_words = len(tokenizer.word_index) + 1
print(num_words)
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')
#Extract Glove embedding zip file
# from zipfile import ZipFile
# with ZipFile('/content/drive/MyDrive/Great_Learning/NLP_sarcasm_detection/glove.6B.200d.txt', 'r') as z:
# z.extractall()
# Load the 200-d GloVe vectors and assemble an embedding matrix aligned with
# the tokenizer's word index (row 0 stays all-zero for padding; words not in
# GloVe keep a zero vector).
EMBEDDING_FILE = '/content/drive/MyDrive/Great_Learning/NLP_sarcasm_detection/glove.6B.200d.txt'
embeddings = {}
for line in open(EMBEDDING_FILE):
    fields = line.split(" ")
    embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')
embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# Create training and test datasets with 80:20 ratio
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = SEED, shuffle = True)
print('\033[1mShape of the training set:\033[0m', x_train.shape, y_train.shape)
print('\033[1mShape of the test set:\033[0m', x_test.shape, y_test.shape)
print(f'\nNumber of rows in training dataset: {x_train.shape[0]}')
print(f'Number of columns in training dataset: {x_train.shape[1]}')
print(f'Number of unique words in training dataset: {len(np.unique(np.hstack(x_train)))}')
print(f'\nNumber of rows in test dataset: {x_test.shape[0]}')
print(f'Number of columns in test dataset: {x_test.shape[1]}')
print(f'Number of unique words in test dataset: {len(np.unique(np.hstack(x_test)))}')
# converting y data into categorical (one-hot encoding)
ytrain = to_categorical(y_train)
ytest = to_categorical(y_test)
print(f'\nsample headline:\n{x_train[50]}\n\n Label of sample headline: {ytrain[50]}\n\n Label of sample headline: {y_train[50]}')
# Class counts: train split, test split, and the full dataset.
num_class = len(np.unique(y_train))
num_class
num_class_test = len(np.unique(y_test))
num_class_test
num_class_all = len(np.unique(dataset2_DL['group'].values))
num_class_all
# Bi-LSTM classifier on GloVe embeddings (embeddings fine-tuned during training).
input_layer = Input(shape=(maxlen,),dtype=tf.int64)
embed = Embedding(num_words,output_dim=200,input_length=maxlen,weights=[embedding_matrix], trainable=True)(input_layer) #weights=[embedding_matrix]
lstm=Bidirectional(LSTM(128))(embed)
drop=Dropout(0.3)(lstm)
dense =Dense(100,activation='relu')(drop)
out=Dense(num_class,activation='softmax')(dense)
model_td = Model(input_layer,out)
model_td.compile(loss='categorical_crossentropy',optimizer="adam",metrics=['accuracy'])
model_td.summary()
tf.keras.utils.plot_model(model_td, show_shapes = True)
# Early stopping on val accuracy, checkpoint the best epoch, and reduce LR on plateaus.
es = EarlyStopping(monitor='val_accuracy', mode = 'auto', verbose = 1, patience = 5)
mc = ModelCheckpoint('model-{epoch:03d}-{val_accuracy:03f}.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
lr_reduction = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience=2, min_lr=0.0001)
batch_size = 100
epochs = 10
model_td_history = model_td.fit(x_train,ytrain,batch_size=batch_size, epochs=epochs, callbacks=[es,mc,lr_reduction], validation_data = (x_test, ytest),verbose=True)
# Evaluate the model & report accuracy
from keras.models import load_model
# NOTE(review): checkpoint filename is run-specific — confirm it matches the
# best checkpoint actually written by the fit above.
model_td_saved = load_model("model-010-0.634224.h5")
scores = model_td_saved.evaluate(x_test, ytest, batch_size = 100, verbose = 1)
print('Test accuracy: %.2f%%' % (scores[1]*100))
#displaying the classification report on test/validation data
yfit = model_td_saved.predict(x_test)
# argmax over the softmax outputs / one-hot labels recovers class ids.
test_predicted = np.argmax ( yfit, axis=-1 )
test = np.argmax ( ytest, axis=-1 )
from sklearn.metrics import classification_report
print(classification_report(test,test_predicted))
#visualizing model performance - loss and accuracy
f, (ax1, ax2) = plt.subplots(1, 2, figsize = (15, 7.2))
f.suptitle('Monitoring the performance of the model')
ax1.plot(model_td_history.history['loss'], label = 'Train')
ax1.plot(model_td_history.history['val_loss'], label = 'Test')
ax1.set_title('Model Loss')
ax1.legend(['Train', 'Test'])
ax2.plot(model_td_history.history['accuracy'], label = 'Train')
ax2.plot(model_td_history.history['val_accuracy'], label = 'Test')
ax2.set_title('Model Accuracy')
ax2.legend(['Train', 'Test'])
plt.show()
12.2 DL Model with data Augmentation (synonym based)
# 12.2: split first, so augmentation (below) touches only the training fold.
# Create training and test datasets with 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(dataset2_DL.combined_description,
dataset2_DL.target,
test_size=0.20,
random_state=SEED)
print('\033[1mShape of the training set:\033[0m', X_train.shape, y_train.shape)
print('\033[1mShape of the test set:\033[0m', X_test.shape, y_test.shape)
#Create Dataset
y_train_df_old = pd.DataFrame(y_train,columns=['target'])
#y_train_df_old_nogrp0 = dataset1[dataset1['Assignment group'] != 'GRP_0']
descending_order = y_train_df_old['target'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#add code to rotate the labels
ax=sns.countplot(x='target', data=y_train_df_old, color='royalblue',order=descending_order)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.title('Group count before augmentation')
plt.show()
#Create Dataset for 'others' i.e all groups which is not part of GRP_0
#y_train_df_old = pd. DataFrame(y_train,columns=['group'])
# Same plot with the dominant class (target 0 == GRP_0) removed for scale.
y_train_df_old_nogrp0 = y_train_df_old[y_train_df_old['target'] != 0]
descending_order = y_train_df_old_nogrp0['target'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#add code to rotate the labels
ax=sns.countplot(x='target', data=y_train_df_old_nogrp0, color='royalblue',order=descending_order)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.title('Group count before augmentation excluding GRP_0')
plt.show()
# Take an example for augmentation
# aug is the synonym-augmenter configured earlier in the notebook; n=2 asks
# for two augmented variants of one training sentence.
example=aug.augment(X_train[5],n=2)
print('\033[1mOriginal text:\033[0m')
print(X_train[5])
print('_'*100)
print('\033[1mAugmented text:\033[0m')
print(example[0])
print(example[1])
# Oversample minority classes by synonym augmentation: rarer groups get
# proportionally more augmented copies per original ticket (3x/6x/12x/24x).
# NOTE(review): label 24 appears in both the 3x and 6x tiers below; the first
# matching branch wins, so 24 is only augmented 3x — confirm intent.
augmented_sentences = []
augmented_sentences_labels = []
for idx in X_train.index:
    label = y_train[idx]
    if label in (24, 9, 12, 2, 19, 3, 6):
        n_copies = 3
    elif label in (13, 10, 5, 14, 24, 31, 18, 28, 4, 16, 47):
        n_copies = 6
    elif label in (30, 32, 25, 27, 37, 15, 38, 29, 40, 36, 11, 20, 1, 42, 41, 22):
        n_copies = 12
    elif label in (46, 7, 35, 17, 26, 39, 34, 45, 43, 33, 21, 44):
        n_copies = 24
    else:
        n_copies = 0  # GRP_0 and any unlisted class: no augmentation
    if n_copies:
        for sentence in aug.augment(X_train[idx], n=n_copies):
            augmented_sentences.append(sentence)
            augmented_sentences_labels.append(label)
# Extend the training split with the augmented rows.
X_train = X_train.append(pd.Series(augmented_sentences), ignore_index=True)
y_train = y_train.append(pd.Series(augmented_sentences_labels), ignore_index=True)
print(X_train.shape)
print(y_train.shape)
# Re-plot the class distribution after augmentation (with and without GRP_0).
y_train_df = pd. DataFrame(y_train,columns=['target'])
y_train_df.info()
#Create Dataset
#y_train_df_old_nogrp0 = dataset1[dataset1['Assignment group'] != 'GRP_0']
descending_order = y_train_df['target'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#add code to rotate the labels
ax=sns.countplot(x='target', data=y_train_df, color='royalblue',order=descending_order)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.title('Group count after augmentation')
plt.show()
#Create Dataset for 'others' i.e all groups which is not part of GRP_0
#y_train_df_old = pd. DataFrame(y_train,columns=['group'])
y_train_df_nogrp0 = y_train_df[y_train_df['target'] != 0]
descending_order = y_train_df_nogrp0['target'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#add code to rotate the labels
ax=sns.countplot(x='target', data=y_train_df_nogrp0, color='royalblue',order=descending_order)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.title('Group count after augmentation excluding GRP_0')
plt.show()
y_train_df.groupby(['target']).size()
# Tokenize the augmented training text; the tokenizer is fitted on train
# only, then applied to both splits, and sequences are pre-padded to maxlen.
max_features = 10000
maxlen = 300
embedding_size = 200
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(X_train)
x_train = tokenizer.texts_to_sequences(X_train)
x_test = tokenizer.texts_to_sequences(X_test)
x_train = pad_sequences(x_train, padding='pre', maxlen = maxlen)
x_test = pad_sequences(x_test, padding='pre', maxlen = maxlen)
#Y = np.asarray(dataset2_DL['group'])
# Print a sample sequence and its label.
print(f'\nsample headline:\n{x_train[0]}\n\n Label of sample headline: {y_train[0]}')
tokenizer.word_index
# +1 because Keras word indices start at 1; row 0 is reserved for padding.
num_words = len(tokenizer.word_index) + 1
print(num_words)
# Mounting Google Drive
#from google.colab import drive
#drive.mount('/content/drive')
#Extract Glove embedding zip file
#from zipfile import ZipFile
#with ZipFile('drive/My Drive/datasets/NLP/sarcasm_detection/Glove.6B.200d.zip', 'r') as z:
#z.extractall()
# Load GloVe vectors and build the embedding matrix for the augmented-data
# tokenizer vocabulary.
EMBEDDING_FILE = '/content/drive/MyDrive/Great_Learning/NLP_sarcasm_detection/glove.6B.200d.txt'
embeddings = {}
for o in open(EMBEDDING_FILE):
    # BUG FIX: the original had `word = o.split(" ")[0]` commented out, so
    # every vector was stored under whatever stale `word` variable was left
    # over from the earlier embedding-loading cell (one key holding the last
    # vector), leaving `embeddings` effectively empty of real entries.
    word = o.split(" ")[0]
    embd = np.asarray(o.split(" ")[1:], dtype='float32')
    embeddings[word] = embd
# create a weight matrix for words in training docs (row 0 stays zero for padding)
embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
#splitting dataset into train and test datasets
#x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = SEED, shuffle = True)
# Shapes and vocabulary sizes of the already-split, padded sequences.
print(f'\nNumber of rows in training dataset: {x_train.shape[0]}')
print(f'Number of columns in training dataset: {x_train.shape[1]}')
print(f'Number of unique words in training dataset: {len(np.unique(np.hstack(x_train)))}')
print(f'\nNumber of rows in test dataset: {x_test.shape[0]}')
print(f'Number of columns in test dataset: {x_test.shape[1]}')
print(f'Number of unique words in test dataset: {len(np.unique(np.hstack(x_test)))}')
print(f'\nsample headline:\n{x_train[5]}\n\n Label of sample headline: {y_train[5]}')
# Class counts: train split, test split, and the full dataset.
num_class = len(np.unique(y_train.values))
num_class
num_class_test = len(np.unique(y_test.values))
num_class_test
num_class_all = len(np.unique(dataset2_DL['group'].values))
num_class_all
# converting y data into categorical (one-hot encoding)
ytrain = to_categorical(y_train)
ytest = to_categorical(y_test)
print(f'\nsample headline:\n{x_train[50]}\n\n Label of sample headline:\n{ytrain[50]}\n\n Label of sample headline:\n{y_train[50]}')
ytest.shape[0],ytest.shape[1]
# Bi-LSTM classifier on the augmented data (same architecture as 12.1).
input_layer = Input(shape=(maxlen,),dtype=tf.int64)
embed = Embedding(num_words,output_dim=200,input_length=maxlen,weights=[embedding_matrix], trainable=True)(input_layer) #weights=[embedding_matrix]
lstm=Bidirectional(LSTM(128))(embed)
drop=Dropout(0.3)(lstm)
dense =Dense(100,activation='relu')(drop)
out=Dense(num_class,activation='softmax')(dense)
model_td = Model(input_layer,out)
model_td.compile(loss='categorical_crossentropy',optimizer="adam",metrics=['accuracy'])
model_td.summary()
tf.keras.utils.plot_model(model_td, show_shapes = True)
# Early stopping, best-epoch checkpointing, LR reduction on plateaus.
es = EarlyStopping(monitor='val_accuracy', mode = 'auto', verbose = 1, patience = 5)
mc = ModelCheckpoint('model-{epoch:03d}-{val_accuracy:03f}.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
lr_reduction = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience=2, min_lr=0.0001)
batch_size = 100
epochs = 10
model_td_history = model_td.fit(x_train,ytrain,batch_size=batch_size, epochs=epochs, callbacks=[es,mc,lr_reduction], validation_data = (x_test, ytest),verbose=True)
# Evaluate the model & report accuracy
from keras.models import load_model
# NOTE(review): checkpoint filename is run-specific — confirm against the run.
model_td_saved = load_model("model-009-0.617684.h5")
scores = model_td_saved.evaluate(x_test, ytest, batch_size = 100, verbose = 1)
print('Test accuracy: %.2f%%' % (scores[1]*100))
#displaying the classification report on test/validation data
yfit = model_td_saved.predict(x_test)
test_predicted = np.argmax ( yfit, axis=-1 )
test = np.argmax ( ytest, axis=-1 )
from sklearn.metrics import classification_report
print(classification_report(test,test_predicted))
#visualizing model performance - loss and accuracy
f, (ax1, ax2) = plt.subplots(1, 2, figsize = (15, 7.2))
f.suptitle('Monitoring the performance of the model')
ax1.plot(model_td_history.history['loss'], label = 'Train')
ax1.plot(model_td_history.history['val_loss'], label = 'Test')
ax1.set_title('Model Loss')
ax1.legend(['Train', 'Test'])
ax2.plot(model_td_history.history['accuracy'], label = 'Train')
ax2.plot(model_td_history.history['val_accuracy'], label = 'Test')
ax2.set_title('Model Accuracy')
ax2.legend(['Train', 'Test'])
plt.show()
12.3 DL model with upsampling/resampling technique (to treat class imbalance)
# 12.3: upsample minority classes for the DL models (mirrors section 11.3).
dataset2_DL_US=dataset2_DL.copy()
dataset2_DL_US.info()
#Create Dataset for 'others' i.e all groups which is not part of GRP_0
dataset2_DL_nogrp0 = dataset2_DL_US[dataset2_DL_US['New Assignment Group'] != 'GRP_0']
descending_order = dataset2_DL_nogrp0['New Assignment Group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#add code to rotate the labels
ax=sns.countplot(x='New Assignment Group', data=dataset2_DL_nogrp0, color='royalblue',order=descending_order)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()
# Upsampling target size: the largest non-GRP_0 class count.
maxcount = dataset2_DL_nogrp0['New Assignment Group'].value_counts().max()
maxcount
# GRP_0 rows are set aside and re-joined after upsampling the rest.
dataset2_DL_grp0 = dataset2_DL_US[dataset2_DL_US['New Assignment Group'] == 'GRP_0']
dataset2_DL_grp0.info()
# Upsample each non-GRP_0 class to `maxcount` rows (with replacement), then
# append the untouched GRP_0 rows to form the balanced DL dataset.
dataset2_DL_nogrp0_upsampled = dataset2_DL_nogrp0[0:0]  # empty frame, same columns
for grp in dataset2_DL_nogrp0['New Assignment Group'].unique():
    rows_for_grp = dataset2_DL_nogrp0[dataset2_DL_nogrp0['New Assignment Group'] == grp]
    resampled = resample(rows_for_grp, replace=True, n_samples=int(maxcount), random_state=123)
    dataset2_DL_nogrp0_upsampled = dataset2_DL_nogrp0_upsampled.append(resampled)
dataset2_DL_upsampled = pd.concat([dataset2_DL_nogrp0_upsampled, dataset2_DL_grp0], ignore_index=True)
# Visualise the balanced distribution, derive targets, and re-tokenize.
descending_order = dataset2_DL_upsampled['New Assignment Group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#add code to rotate the labels
ax=sns.countplot(x='New Assignment Group', data=dataset2_DL_upsampled, color='royalblue')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()
# Create a column for target (assignment group) variable
dataset2_DL_upsampled['group'] = dataset2_DL_upsampled['New Assignment Group'].str[4:]
#from sklearn. preprocessing import LabelEncoder
#le = LabelEncoder()
#dataset2['group'] = le. fit_transform(dataset2['group'])
dataset2_DL_upsampled['group'] = dataset2_DL_upsampled['group'].astype('int8')
dataset2_DL_upsampled['target'] = dataset2_DL_upsampled['group'].astype('category').cat.codes
dataset2_DL_upsampled.info()
dataset2_DL_upsampled.groupby(["group", "target"]).size()
# Fit the tokenizer on the upsampled corpus and pad sequences to maxlen.
max_features = 10000
maxlen = 300
embedding_size = 200
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(dataset2_DL_upsampled['combined_description']))
X = tokenizer.texts_to_sequences(dataset2_DL_upsampled['combined_description'])
X = pad_sequences(X, maxlen = maxlen)
Y = np.asarray(dataset2_DL_upsampled['target'])
# Print a sample sequence and its label.
print(f'\nsample headline:\n{X[0]}\n\n Label of sample headline: {Y[0]}')
tokenizer.word_index
word_index= tokenizer.word_index
# +1 because Keras word indices start at 1; row 0 is reserved for padding.
num_words = len(tokenizer.word_index) + 1
print(num_words)
# # Mounting Google Drive
# from google.colab import drive
# drive.mount('/content/drive')
#Extract Glove embedding zip file
#from zipfile import ZipFile
# with ZipFile('/content/drive/MyDrive/Great_Learning/archive.zip', 'r') as z:
# z.extractall()
# Rebuild the GloVe lookup and the embedding matrix for the upsampled DL
# dataset's tokenizer vocabulary (row 0 stays all-zero for padding; words
# absent from GloVe keep a zero vector).
EMBEDDING_FILE = '/content/drive/MyDrive/Great_Learning/NLP_sarcasm_detection/glove.6B.200d.txt'
embeddings = {}
for glove_line in open(EMBEDDING_FILE):
    fields = glove_line.split(" ")
    embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# Create training and test datasets with 80:20 ratio
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = SEED, shuffle = True)
print('\033[1mShape of the training set:\033[0m', x_train.shape, y_train.shape)
print('\033[1mShape of the test set:\033[0m', x_test.shape, y_test.shape)
print(f'\nNumber of rows in training dataset: {x_train.shape[0]}')
print(f'Number of columns in training dataset: {x_train.shape[1]}')
print(f'Number of unique words in training dataset: {len(np.unique(np.hstack(x_train)))}')
print(f'\nNumber of rows in test dataset: {x_test.shape[0]}')
print(f'Number of columns in test dataset: {x_test.shape[1]}')
print(f'Number of unique words in test dataset: {len(np.unique(np.hstack(x_test)))}')
num_class = len(np.unique(y_train))
num_class
num_class_test = len(np.unique(y_test))
num_class_test
num_class_all = len(np.unique(dataset2_DL_upsampled['group'].values))
num_class_all
# converting y data into categorical (one-hot encoding)
# Fixed width of 48 keeps train/test one-hot shapes aligned even if a class
# is missing from one split; presumably 48 is the total group count — verify.
ytrain = to_categorical(y_train,48)
ytest = to_categorical(y_test,48)
print(f'\nsample headline:\n{x_train[50]}\n\n Label of sample headline: {ytrain[50]}\n\n Label of sample headline: {y_train[50]}')
# Plain (unidirectional) LSTM classifier on the upsampled data.
input_layer = Input(shape=(maxlen,),dtype=tf.int64)
embed = Embedding(num_words,output_dim=200,input_length=maxlen,weights=[embedding_matrix], trainable=True)(input_layer) #weights=[embedding_matrix]
lstm= LSTM(128)(embed)
drop=Dropout(0.3)(lstm)
dense =Dense(100,activation='relu')(drop)
out=Dense(num_class,activation='softmax')(dense)
model_lstm= Model(input_layer,out)
model_lstm.compile(loss='categorical_crossentropy',optimizer="adam",metrics=['accuracy'])
model_lstm.summary()
tf.keras.utils.plot_model(model_lstm, show_shapes = True)
# Early stopping, best-epoch checkpointing, LR reduction on plateaus.
es = EarlyStopping(monitor='val_accuracy', mode = 'auto', verbose = 1, patience = 5)
mc = ModelCheckpoint('model-{epoch:03d}-{val_accuracy:03f}.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
lr_reduction = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience=2, min_lr=0.0001)
batch_size = 100
epochs = 10
model_lstm_history = model_lstm.fit(x_train,ytrain,batch_size=batch_size, epochs=epochs,
callbacks=[es,mc,lr_reduction], validation_data = (x_test, ytest),verbose=True)
# Evaluate the model & report accuracy
from keras.models import load_model
# NOTE(review): checkpoint filename is run-specific — confirm against the run.
model_lstm_saved = load_model("model-009-0.938931.h5")
scores = model_lstm_saved.evaluate(x_test, ytest, batch_size = 100, verbose = 1)
print('Test accuracy: %.2f%%' % (scores[1]*100))
model_lstm_saved.get_config()
#visualizing model performance - loss and accuracy
f, (ax1, ax2) = plt.subplots(1, 2, figsize = (15, 7.2))
f.suptitle('Monitoring the performance of the LSTM model')
ax1.plot(model_lstm_history.history['loss'], label = 'Train')
ax1.plot(model_lstm_history.history['val_loss'], label = 'Test')
ax1.set_title('Model Loss')
ax1.legend(['Train', 'Test'])
ax2.plot(model_lstm_history.history['accuracy'], label = 'Train')
ax2.plot(model_lstm_history.history['val_accuracy'], label = 'Test')
ax2.set_title('Model Accuracy')
ax2.legend(['Train', 'Test'])
plt.show()
BI-LSTM
# Bi-LSTM classifier on the upsampled data (same shape as the plain LSTM
# above, but with a bidirectional recurrent layer).
input_layer = Input(shape=(maxlen,),dtype=tf.int64)
embed = Embedding(num_words,output_dim=200,input_length=maxlen,weights=[embedding_matrix], trainable=True)(input_layer) #weights=[embedding_matrix]
bi_lstm=Bidirectional(LSTM(128))(embed)
drop=Dropout(0.3)(bi_lstm)
dense =Dense(100,activation='relu')(drop)
out=Dense(num_class,activation='softmax')(dense)
model_bi_lstm = Model(input_layer,out)
model_bi_lstm.compile(loss='categorical_crossentropy',optimizer="adam",metrics=['accuracy'])
model_bi_lstm.summary()
tf.keras.utils.plot_model(model_bi_lstm, show_shapes = True)
es = EarlyStopping(monitor='val_accuracy', mode = 'auto', verbose = 1, patience = 5)
mc = ModelCheckpoint('model-{epoch:03d}-{val_accuracy:03f}.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
lr_reduction = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience=2, min_lr=0.0001)
batch_size = 100
epochs = 10
model_bi_lstm_history = model_bi_lstm.fit(x_train,ytrain,batch_size=batch_size, epochs=epochs,
callbacks=[es,mc,lr_reduction], validation_data = (x_test, ytest),verbose=True)
# Evaluate the model & report accuracy
from keras.models import load_model
# NOTE(review): checkpoint filename is run-specific — confirm against the run.
model_bi_lstm_saved = load_model("model-009-0.935592.h5")
scores = model_bi_lstm_saved.evaluate(x_test, ytest, batch_size = 100, verbose = 1)
print('Test accuracy: %.2f%%' % (scores[1]*100))
#visualizing model performance - loss and accuracy
f, (ax1, ax2) = plt.subplots(1, 2, figsize = (15, 7.2))
f.suptitle('Monitoring the performance of the Bi-LSTM model')
ax1.plot(model_bi_lstm_history.history['loss'], label = 'Train')
ax1.plot(model_bi_lstm_history.history['val_loss'], label = 'Test')
ax1.set_title('Model Loss')
ax1.legend(['Train', 'Test'])
ax2.plot(model_bi_lstm_history.history['accuracy'], label = 'Train')
ax2.plot(model_bi_lstm_history.history['val_accuracy'], label = 'Test')
ax2.set_title('Model Accuracy')
ax2.legend(['Train', 'Test'])
plt.show()
Recurrent Neural Networks (RNN): an RNN assigns more weight to the preceding data points of a sequence, which makes it a powerful method for classifying text, strings and other sequential data (and it can also be applied to image classification). An RNN incorporates the information of previous nodes in a sophisticated way that allows for better semantic analysis of the structures in the dataset.
Gated Recurrent Unit (GRU)
Gated Recurrent Unit (GRU) is a gating mechanism for RNNs introduced by J. Chung et al. and K. Cho et al. GRU is a simplified variant of the LSTM architecture, with the following differences: first, a GRU contains only two gates and does not possess any internal memory; second, no output non-linearity is applied.
from keras.models import Sequential
from keras.layers import Dense, LSTM, TimeDistributed, Activation
from keras.layers import Flatten, Permute, merge, Input
from keras.layers import Embedding
from keras.models import Model
from keras.layers import Input, Dense, multiply, concatenate, Dropout
from keras.layers import GRU, Bidirectional
# GRU classifier on the upsampled data (same head as the LSTM variants).
input_layer = Input(shape=(maxlen,),dtype=tf.int64)
embed = Embedding(num_words,output_dim=200,input_length=maxlen,weights=[embedding_matrix], trainable=True)(input_layer) #weights=[embedding_matrix]
gru= GRU(128)(embed)
drop=Dropout(0.3)(gru)
dense =Dense(100,activation='relu')(drop)
# Output width derived from the distinct labels in y_train.
out=Dense(len((pd.Series(y_train)).unique()),activation='softmax')(dense)
model_GRU = Model(input_layer,out)
model_GRU.compile(loss='categorical_crossentropy',optimizer="adam",metrics=['accuracy'])
model_GRU.summary()
tf.keras.utils.plot_model(model_GRU, show_shapes = True)
es = EarlyStopping(monitor='val_accuracy', mode = 'auto', verbose = 1, patience = 5)
mc = ModelCheckpoint('model-{epoch:03d}-{val_accuracy:03f}.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
lr_reduction = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience=2, min_lr=0.0001)
batch_size = 100
epochs = 10
model_GRU_history = model_GRU.fit(x_train,ytrain,batch_size=batch_size, epochs=epochs,
callbacks=[es,mc,lr_reduction], validation_data = (x_test, ytest),verbose=True)
# Evaluate the model & report accuracy
from keras.models import load_model
# NOTE(review): checkpoint filename is run-specific — confirm against the run.
model_GRU_saved = load_model("model-009-0.939885.h5")
scores = model_GRU_saved.evaluate(x_test, ytest, batch_size = 100, verbose = 1)
print('Test accuracy: %.2f%%' % (scores[1]*100))
model_GRU_saved.get_config()
#visualizing model performance - loss and accuracy
f, (ax1, ax2) = plt.subplots(1, 2, figsize = (15, 7.2))
f.suptitle('Monitoring the performance of the GRU model')
ax1.plot(model_GRU_history.history['loss'], label = 'Train')
ax1.plot(model_GRU_history.history['val_loss'], label = 'Test')
ax1.set_title('Model Loss')
ax1.legend(['Train', 'Test'])
ax2.plot(model_GRU_history.history['accuracy'], label = 'Train')
ax2.plot(model_GRU_history.history['val_accuracy'], label = 'Test')
ax2.set_title('Model Accuracy')
ax2.legend(['Train', 'Test'])
plt.show()
# Condensed ML summary: keep only the accuracy columns and export to Excel.
result_ML_final = result_ML_US[['accuracy_training','accuracy_test']]
result_ML_final
result_ML_final.to_excel('result_ML_final.xlsx')
# Collect train/test accuracies (as percentages) for all three DL models
# into a comparison table.
accuracy_training_LSTM = (model_lstm_saved.evaluate(x_train,ytrain)[1])*100
accuracy_test_LSTM = (model_lstm_saved.evaluate(x_test, ytest)[1])*100
accuracy_training_BiLSTM = (model_bi_lstm_saved.evaluate(x_train,ytrain)[1])*100
accuracy_test_BiLSTM = (model_bi_lstm_saved.evaluate(x_test, ytest)[1])*100
accuracy_training_GRU= (model_GRU_saved.evaluate(x_train,ytrain)[1])*100
accuracy_test_GRU = (model_GRU_saved.evaluate(x_test, ytest)[1])*100
Result_DL = {'Model' : ['LSTM', 'Bi-LSTM','GRU'],
'Accuracy score training' :[accuracy_training_LSTM, accuracy_training_BiLSTM, accuracy_training_GRU],
'Accuracy score Test': [accuracy_test_LSTM, accuracy_test_BiLSTM, accuracy_test_GRU]}
Result_DL= pd.DataFrame(Result_DL)
Result_DL
12.4 HyperTuning Deep Learning Models
!pip install hyperas
!pip install keras-tuner
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from keras.callbacks import ReduceLROnPlateau
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from keras import layers
import hyperas
from hyperas import optim
from hyperas.distributions import choice, uniform
from hyperopt import Trials, STATUS_OK, tpe
from tensorflow import keras
from kerastuner import BayesianOptimization
import keras_tuner as kt
import random
Hyperparameter Tuning for LSTM
def neural_network(num_neurons=100, act='relu',
                   dropout=0.3, num_class=num_class, maxlen=maxlen, num_words=num_words):
    """Build and compile an LSTM text classifier for ticket assignment.

    Parameters
    ----------
    num_neurons : int
        Units in the intermediate dense layer (tuned by grid search).
    act : str
        Activation of the intermediate dense layer.
    dropout : float
        Dropout rate applied after the LSTM layer.
    num_class, maxlen, num_words :
        Defaults are captured from module-level globals at definition time
        (vocabulary size, padded sequence length, number of target classes).

    Returns
    -------
    A compiled tf.keras Model (categorical cross-entropy, Adam, accuracy).
    """
    input_layer = Input(shape=(maxlen,), dtype=tf.int64)
    # Pre-trained embedding matrix (built in an earlier cell), fine-tuned here.
    embed = Embedding(num_words, output_dim=200, input_length=maxlen,
                      weights=[embedding_matrix], trainable=True)(input_layer)
    lstm = LSTM(128)(embed)
    drop = Dropout(dropout)(lstm)
    # Fix: honour the `act` argument — previously the parameter was accepted
    # but silently ignored in favour of a hard-coded 'relu'. The default is
    # still 'relu', so existing callers see identical behaviour.
    dense = Dense(num_neurons, activation=act)(drop)
    out = Dense(num_class, activation='softmax')(dense)
    model_lstm = Model(input_layer, out)
    model_lstm.compile(loss='categorical_crossentropy', optimizer="adam",
                       metrics=['accuracy'])
    return model_lstm
# Fix: GridSearchCV is used below but was never imported (only
# RandomizedSearchCV was); import it so this cell runs standalone.
from sklearn.model_selection import GridSearchCV

# Wrap the Keras builder so scikit-learn's search utilities can drive it.
model_lstm = KerasClassifier(build_fn=neural_network, verbose=0)
# Search space: only num_neurons varies (2 candidates); batch size,
# epochs and dropout are fixed to keep the search tractable.
batch_size = [100]
epochs = [10]
num_neurons = [50, 100]
dropout = [0.3]
param_grid = dict(batch_size=batch_size, epochs=epochs,
                  num_neurons=num_neurons,
                  dropout=dropout)
# 5-fold cross-validated exhaustive search, parallelised across all cores.
grid = GridSearchCV(estimator=model_lstm, param_grid=param_grid, cv=5, n_jobs=-1)
grid_result_lstm = grid.fit(x_train, ytrain)
grid_result_lstm.best_params_
print('Best score LSTM GridSearchCV: ', grid_result_lstm.best_score_)
print('Best param LSTM GridSearchCV: ', grid_result_lstm.best_params_)
print('Execution time LSTM GridSearchCV: ', grid_result_lstm.refit_time_)
Hyperparameter Tuning for Bi-LSTM
def neural_network(num_neurons=100, act='relu',
                   dropout=0.3, num_class=num_class, maxlen=maxlen, num_words=num_words):
    """Build and compile a Bidirectional-LSTM text classifier.

    Parameters
    ----------
    num_neurons : int
        Units in the intermediate dense layer (tuned by grid search).
    act : str
        Activation of the intermediate dense layer.
    dropout : float
        Dropout rate applied after the Bi-LSTM layer.
    num_class, maxlen, num_words :
        Defaults are captured from module-level globals at definition time.

    Returns
    -------
    A compiled tf.keras Model (categorical cross-entropy, Adam, accuracy).
    """
    input_layer = Input(shape=(maxlen,), dtype=tf.int64)
    # Pre-trained embedding matrix (built in an earlier cell), fine-tuned here.
    embed = Embedding(num_words, output_dim=200, input_length=maxlen,
                      weights=[embedding_matrix], trainable=True)(input_layer)
    bi_lstm = Bidirectional(LSTM(128))(embed)
    drop = Dropout(dropout)(bi_lstm)
    # Fix: honour the `act` argument — previously accepted but silently
    # ignored in favour of a hard-coded 'relu' (default unchanged).
    dense = Dense(num_neurons, activation=act)(drop)
    out = Dense(num_class, activation='softmax')(dense)
    model_bi_lstm = Model(input_layer, out)
    model_bi_lstm.compile(loss='categorical_crossentropy', optimizer="adam",
                          metrics=['accuracy'])
    return model_bi_lstm
# Fix: GridSearchCV is used below but was never imported (only
# RandomizedSearchCV was); import it so this cell runs standalone.
from sklearn.model_selection import GridSearchCV

# Wrap the Keras builder so scikit-learn's search utilities can drive it.
model_bi_lstm = KerasClassifier(build_fn=neural_network, verbose=0)
# Same small search space as the LSTM run, for a fair comparison.
batch_size = [100]
epochs = [10]
num_neurons = [50, 100]
dropout = [0.3]
param_grid = dict(batch_size=batch_size, epochs=epochs,
                  num_neurons=num_neurons,
                  dropout=dropout)
# 5-fold cross-validated exhaustive search, parallelised across all cores.
grid = GridSearchCV(estimator=model_bi_lstm, param_grid=param_grid, cv=5, n_jobs=-1)
grid_result_bi_lstm = grid.fit(x_train, ytrain)
grid_result_bi_lstm.best_params_
# Fix: corrected "GridSerach" typo in the report messages.
print('Best score Bi-LSTM GridSearchCV: ', grid_result_bi_lstm.best_score_)
print('Best params Bi-LSTM GridSearchCV: ', grid_result_bi_lstm.best_params_)
print('Execution time Bi-LSTM GridSearchCV: ', grid_result_bi_lstm.refit_time_)
Hyperparameter Tuning for GRU
def neural_network(num_neurons=100, act='relu',
                   dropout=0.3, num_class=num_class, maxlen=maxlen, num_words=num_words):
    """Build and compile a GRU text classifier.

    Parameters
    ----------
    num_neurons : int
        Units in the intermediate dense layer (tuned by grid search).
    act : str
        Activation of the intermediate dense layer.
    dropout : float
        Dropout rate applied after the GRU layer.
    num_class, maxlen, num_words :
        Defaults are captured from module-level globals at definition time.

    Returns
    -------
    A compiled tf.keras Model (categorical cross-entropy, Adam, accuracy).
    """
    # Fix: GRU is never imported at the top of the notebook (only LSTM and
    # Bidirectional are), so referencing it raised NameError. Import locally.
    from tensorflow.keras.layers import GRU
    input_layer = Input(shape=(maxlen,), dtype=tf.int64)
    # Pre-trained embedding matrix (built in an earlier cell), fine-tuned here.
    embed = Embedding(num_words, output_dim=200, input_length=maxlen,
                      weights=[embedding_matrix], trainable=True)(input_layer)
    gru = GRU(128)(embed)
    drop = Dropout(dropout)(gru)
    # Fix: honour the `act` argument — previously accepted but silently
    # ignored in favour of a hard-coded 'relu' (default unchanged).
    dense = Dense(num_neurons, activation=act)(drop)
    out = Dense(num_class, activation='softmax')(dense)
    model_gru = Model(input_layer, out)
    model_gru.compile(loss='categorical_crossentropy', optimizer="adam",
                      metrics=['accuracy'])
    return model_gru
# Fix: GridSearchCV is the class actually used below, but only
# RandomizedSearchCV had been imported (and was re-imported here unused) —
# import the class we need instead.
from sklearn.model_selection import GridSearchCV

# Wrap the Keras builder so scikit-learn's search utilities can drive it.
model_gru = KerasClassifier(build_fn=neural_network, verbose=0)
# Same small search space as the LSTM and Bi-LSTM runs, for a fair comparison.
batch_size = [100]
epochs = [10]
num_neurons = [50, 100]
dropout = [0.3]
param_grid = dict(batch_size=batch_size, epochs=epochs,
                  num_neurons=num_neurons,
                  dropout=dropout)
# 5-fold cross-validated exhaustive search, parallelised across all cores.
grid = GridSearchCV(estimator=model_gru, param_grid=param_grid, cv=5, n_jobs=-1)
grid_result_gru = grid.fit(x_train, ytrain)
grid_result_gru.best_params_
print('Best score GRU GridSearchCV: ', grid_result_gru.best_score_)
print('Best params GRU GridSearchCV: ', grid_result_gru.best_params_)
print('Execution time GRU GridSearchCV: ', grid_result_gru.refit_time_)
# Load the consolidated ML + DL results previously exported to Excel.
# NOTE(review): hard-coded Colab path — confirm the file exists at this
# location before re-running.
result_ML_DL_final = pd.read_excel('/content/sample_data/result_ML_DL_final.xlsx')
result_ML_DL_final
# Report each model's cross-validated best score as a percentage.
# Fix: corrected "GridSerach" typo in the Bi-LSTM message.
print('Best score LSTM GridSearchCV: ', (grid_result_lstm.best_score_)*100)
print('Best score Bi-LSTM GridSearchCV: ', (grid_result_bi_lstm.best_score_)*100)
print('Best score GRU GridSearchCV: ', (grid_result_gru.best_score_)*100)
print("Best score Random Forest GridSearchCV: ", (RF_CV_Fit.best_score_)*100)
The GRU model gives the best accuracy among the Deep Learning models through Grid Search CV, at 92.16%.
The Random Forest model gives the highest accuracy among the Machine Learning models — and among all models — at 94.03%.
We have also seen that Random Forest had the highest accuracy, precision and recall in the first cut among all models.
# Best hyper-parameters found earlier by grid search over Random Forest
# (presumably the source of the rf_tuned settings below — verify they match).
RF_CV_Fit.best_params_
# Create training and test datasets with 80:20 ratio without augmentation
X_train, X_test, y_train, y_test = train_test_split(dataset_ML_upsampled.combined_description,
                                                    dataset_ML_upsampled.target,
                                                    test_size=0.20,
                                                    random_state=42)
print('\033[1mShape of the training set:\033[0m', X_train.shape, y_train.shape)
print('\033[1mShape of the test set:\033[0m', X_test.shape, y_test.shape)
# Fix: max_features='auto' was deprecated and removed in scikit-learn 1.3;
# for classifiers 'auto' was an alias for 'sqrt', so this change preserves
# behaviour while keeping the code runnable on current scikit-learn.
rf_tuned = RandomForestClassifier(bootstrap=True, max_depth=None, max_features='sqrt',
                                  min_samples_leaf=1, n_estimators=100)
result = {}  # metrics of each model, keyed by display name
for model, name in zip([rf_tuned],
                       ['Random Forest - tuned ']):
    # fit_n_print (defined in an earlier cell) fits the model and returns
    # its train/test metrics.
    result[name] = fit_n_print(model, X_train, X_test, y_train, y_test)
# Build a metrics table; [:, :-1] drops the last element of each metric row
# (presumably an extra field returned by fit_n_print — confirm against its
# definition).
result_RF_tuned = pd.DataFrame(np.array(list(result.values()))[:, :-1],
                               columns=['accuracy_training', 'accuracy_test',
                                        'recallscore_training', 'recallscore_test',
                                        'precision_training', 'precision_test',
                                        'f1score_training', 'f1score_test',
                                        'Elapsed'],
                               index=result.keys())  # use the model names as index
result_RF_tuned.index.name = 'Model'
result_RF_tuned
The objective of the capstone project is to build a classifier that can assign tickets by analyzing their text.
The Random Forest model gives not only better accuracy, but also better recall and precision.
Hence, we proceed with the Random Forest model.
SAVING AND LOADING THE RANDOM FOREST MODEL
# Pickle the tuned Random Forest model to disk, then reload it to verify
# round-tripping.
# Fix: pickle was used without ever being imported in this notebook.
import pickle

filename = 'finalized_model.sav'
# Fix: use context managers — the original open() handles were never closed.
with open(filename, 'wb') as fh:
    pickle.dump(rf_tuned, fh)
with open(filename, 'rb') as fh:
    loaded_model = pickle.load(fh)
%%shell
jupyter nbconvert --to html /PATH/TO/YOUR/GL_DecA_G4_NLP1_Final_With_MLandDL.ipynb